1 Univ. Grenoble Alpes, CEA, CNRS, INRAE, IRIG, Laboratoire de Physiologie Cellulaire & Végétale, 38000 Grenoble.
2 Univ. Grenoble-Alpes, CNRS, Jardin du Lautaret, 38000, Grenoble, France.
3 Univ. Grenoble-Alpes, Univ. Savoie Mont Blanc, CNRS, LECA, 38000, Grenoble, France.

@ Correspondence: Eric Maréchal <>, Eric Coissac <>

Go back to the global description of the project

1 Setting up the R environment

1.1 Loading of the R libraries

  • ROBITools package is used to read result files produced by OBITools.

  • ROBITaxonomy package provides function allowing to query OBITools formated taxonomy.

if (!require(ROBITools)) {

  # ROBITools are not available on CRAN and have to be installed
  # from http://git.metabarcoding.org using devtools

  if (!require(devtools)) {
    install.packages("devtools")
  }

  devtools::install_git("https://git.metabarcoding.org/obitools/ROBIUtils.git")
  devtools::install_git("https://git.metabarcoding.org/obitools/ROBITaxonomy.git")
  devtools::install_git("https://git.metabarcoding.org/obitools/ROBITools.git")

  library(ROBITools)
}
## Loading required package: ROBITools
## ROBITools package
library(ROBITaxonomy)
## 
## Attaching package: 'ROBITaxonomy'
## The following object is masked from 'package:stats':
## 
##     family
  • tidyverse provides various method for efficient data manipulation and plotting via ggplot2
if (!require(tidyverse)) {
  install.packages("tidyverse")
  library(tidyverse)
}
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.0     ✓ dplyr   1.0.5
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
  • vegan is loaded for its decostand function
if (!require(vegan)) {
  install.packages("vegan")
  library(vegan)
}
## Loading required package: vegan
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-7
## 
## Attaching package: 'vegan'
## The following object is masked from 'package:ROBITools':
## 
##     rarefy
  • ade4 provides multivariate analysis functions
if (!require(ade4)) {
  install.packages("ade4")
  library(ade4)
}
## Loading required package: ade4
  • matrixStats is loaded for its weightedMedian function
if (!require(matrixStats)) {
  install.packages("matrixStats")
  library(matrixStats)
}
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
  • sfsmisc is loaded for its pretty10exp function
if (!require(sfsmisc)) {
  install.packages("sfsmisc")
  library(sfsmisc)
}
## Loading required package: sfsmisc
## 
## Attaching package: 'sfsmisc'
## The following object is masked from 'package:dplyr':
## 
##     last
  • GGally is loaded for its ggpairs function
if (!require(GGally)) {
  install.packages("GGally")
  library(GGally)
}
## Loading required package: GGally
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
  • ggrepel is loaded for its geom_text_repel function
if (!require(ggrepel)) {
  install.packages("ggrepel")
  library(ggrepel)
}
## Loading required package: ggrepel
  • gridExtra is loaded for its grid.arrange function
if (!require(gridExtra)) {
  install.packages("gridExtra")
  library(gridExtra)
}
## Loading required package: gridExtra
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
  • robustreg is loaded for its robustRegBS function
if (!require(robustreg)) {
  install.packages("robustreg")
  library(robustreg)
}
## Loading required package: robustreg
  • ggpubr is loaded for its ggarrange function
if (!require(ggpubr)) {
  install.packages("ggpubr")
  library(ggpubr)
}
## Loading required package: ggpubr
  • gg.gap is loaded for its gg.gap function
if (!require(gg.gap)) {
  install.packages("gg.gap")
  library(gg.gap)
}
## Loading required package: gg.gap

1.2 Utility functions

source("utils._func.R")

1.2.1 Data standardization functions

1.2.1.1 Function converting the sites list to a factor ordored according to their median elevation.

site_factor <- function(x) {
  levels1 <- names(tapply(euka03@samples$Elevation, 
                          euka03@samples$Site_short, 
                          median) %>% sort())
  xf <- factor(x, levels = levels1)
  xf
}

1.2.1.2 Function translating environment names from French to English.

milieu_factor <- function(x) {
  f <- as.character(x) %>% 
       factor(levels = c("Milieu forestier", 
                         "Milieu ouvert"))
  levels(f) <- c("Forest", "Open area")
  f
}

1.2.1.3 Function substituting soil horizon from abbreviations to full names.

horizon_factor <- function(x) {
  f <- as.character(x) %>% 
       factor(levels = c("LIT", "SOL"))
  levels(f) <- c("Litter", "Topsoil")
  f
}

1.2.1.4 Function renaming environmental variables.

variable_factor <- function(x) {
  f <- as.character(x) %>% 
       factor(levels = c("Elevation", "pH", 
                         "Organic_matter_2mm", 
                         "Carbon", "Nitrogen", 
                         "cn_ratio"))
  levels(f) <- c("Elevation", "pH", 
                 "Organic matter", 
                 "Carbon", "Nitrogen", 
                 "C/N Ratio")
  f
}

1.2.2 Graphical functions

1.2.2.1 theme_paper defines the ggplot theme for the figures

theme_paper <- function() {
  theme_classic(base_size = 9) +
    theme(
      strip.background = element_rect(linetype = "blank"),
      panel.spacing = unit(1, "lines")
    )
}

1.2.2.2 Functions to save graphics for publication

Functions wrapping the ggplot ggsave function to produce TIFF and PDF files according to the Frontiers recommendations

  • write_figure_1_col : for the one column figures
  • write_figure_2_cols : for the two columns figures
write_figure <- function(name, width = 180, height = 90, dpi = 300) {
  dir.create("Figures", showWarnings = FALSE)
  ggsave(sprintf("Figures/%s.pdf", name), 
         units = "mm", width = width, 
         height = height, dpi = dpi)
  ggsave(sprintf("Figures/%s.tiff", name), 
         units = "mm", width = width, 
         height = height, dpi = dpi)
}

write_figure_1_col <- function(name, height = 90, dpi = 300) {
  write_figure(name, 85, height, dpi)
}

write_figure_2_cols <- function(name, height = 90, dpi = 300) {
  write_figure(name, 180, height, dpi)
}

1.2.3 Statistical functions

1.2.3.1 vif estimates the Variance Inflation Factor.

Function estimating the Variance Inflation Factor (VIF). The function estimates VIF for every columns of a data.frame

VIF estimates how much a explanatory variable can be expressed as a linear combination of the others. It is commonly admited to remove variables with a $ VIF>5$.

\[ VIF = \frac{1}{1-R^2} \]

with \(R^2\) the adjusted \(R^2\) of the linear regression of one explanatory variable by the others.

vif <- function(data) {
  n <- ncol(data)

  r2 <- sapply(
    1:n,
    function(i) {
      summary(lm(as.formula(data[, c(i, (1:n)[-i])]),
        data = data
      ))$adj.r.squared
    }
  )
  v <- 1 / (1 - r2)
  names(v) <- colnames(data)
  sort(v, decreasing = TRUE)
}

1.2.3.2 rtest_niche estimates the specialisation of a niche.

The niche is defined following the OMI model implemented in the ade4::niche function.

rtest_niche implements a double permutation test on :

  • the marginality measure (\(H_1\) : the marginality is different from \(0\))
  • the tolerance (\(H_1\) tolerance is smaller than in a uniform random distribution)

The test is implemented following the same permutation procedure used to test marginality in the ade4::rtest function.

rtest_niche <- function(xtest, nrepet = 99, ...) {
  if (!inherits(xtest, "dudi")) {
    stop("Object of class dudi expected")
  }
  if (!inherits(xtest, "niche")) {
    stop("Type 'niche' expected")
  }

  appel <- as.list(xtest$call)
  X <- eval.parent(appel$dudiX)$tab

  Y <- eval.parent(appel$Y)
  w1 <- colSums(Y)
  if (any(w1 <= 0)) {
    stop(paste("Column sum <=0 in Y"))
  }
  Y <- sweep(Y, 2, w1, "/")

  calcul_niche <- function(freq, mil) {
    m <- colSums(freq * mil)
    mil <- t(t(mil) - m)
    tolt <- sum(freq * mil * mil)
    u <- m / sqrt(sum(m^2))
    z <- mil %*% u
    c(sum(m^2), sum(freq * z * z))
  }

  obs_niche <- apply(Y, 2, calcul_niche, mil = X)

  #    obs <- c(obs, Tolm.mean = mean(obs))
  sim_niche <- lapply(
    1:nrepet,
    function(x) {
      apply(apply(Y, 2, sample),
        2,
        calcul_niche,
        mil = X
      )
    }
  )

  better_mid <- rowSums(sapply(1:length(sim_niche), 
                               function(i) sim_niche[[i]][1, ] >= obs_niche[1, ]))
  better_tol <- rowSums(sapply(1:length(sim_niche), 
                               function(i) sim_niche[[i]][2, ] <= obs_niche[2, ]))
  better <- mapply(min, better_mid, better_tol)

  (better + 1) / (nrepet + 1)
}

1.2.3.3 motus_gradient estimates the distribution range of MOTUs along a gradient.

Considering an environmental gradient that function estimates :

  • The pic of density of each MOTU along the gradient. That value is estimated as the mean of the gradient weighted by the read relative frequency of the considered MOTU in the samples.

  • The limits of the niche, centred on the pic of density, and estimated from the tolerance parameter (the variance of the gradient weighted by the read relative frequency of the MOTU).

  • Estimated \(p_value\) of that niche using the rtest_niche function.

Parameters :

  • data : a metabarcoding.data instance.
  • gradient : a numeric vector with one value per sample.

returns

A data.frame of ten columns :

  • motu : the MOTU name
  • gradient_weight : the weigthed median of the gradient value for the MOTU
  • range_low : the lower limit of the gradient range
  • range_high : the higher limit of the gradient range
  • pval : the p-value as returned by the rtest_niche function.
  • var : variance of the gradient.
  • sd : square root of var.
niche_motus_gradient <- function(data, gradient, nrepet = 999) {
  relfreq <- decostand(data@reads, method = "total")
  motus_normed <- decostand(relfreq, method = "total", MARGIN = 2)

  mean_gradient <- colSums(sweep(motus_normed, 
                                 MARGIN = 1, 
                                 STATS = gradient, FUN = "*"))
  var_gradient <- colSums(outer(gradient, 
                                mean_gradient, 
                                "-")^2 * motus_normed)
  data.frame(
    motu = colnames(data),
    gradient_weight = mean_gradient,
    range_low = mean_gradient - 1.96 * sqrt(var_gradient),
    range_high = mean_gradient + 1.96 * sqrt(var_gradient),
    pval = rtest_niche(niche(dudi.pca(data.frame(gradient),
      scannf = FALSE
    ),
    as.data.frame(relfreq),
    scannf = FALSE
    ), nrepet = 999),
    var = var_gradient,
    sd = sqrt(var_gradient)
  )
}

1.2.3.4 plot_motus_gradient plots the result of the motus_gradient function

That function draws a map of the MOTUs distribution along a gradient. It use the result of the motus_gradient function as input.

Parameters :

  • data : a metabarcoding.data instance.
  • gradient : a numeric vector with one value per sample.
  • motus_distribution:
  • motus_order:
  • alpha_risk:
  • jitter:
plot_motus_gradient <- function(data, gradient,
                                motus_distribution,
                                motus_order = motus_distribution$gradient_weight,
                                alpha_risk = 0.05,
                                only_h1 = FALSE,
                                jitter = 0.01) {
  jitter <- jitter * sd(gradient)

  motus_distribution$raw_pval <- motus_distribution$pval
  motus_distribution$pval <- p.adjust(motus_distribution$pval,
                                      method = "fdr")

  relfreq <- decostand(data@reads, method = "total")

  if (only_h1) {
    relfreq <- relfreq[, motus_distribution$pval < alpha_risk]
  }

  urank <- function(data) {
    r <- rank(data)
    ru <- unique(r)
    rur <- rank(ru)
    names(rur) <- ru
    rep <- rur[as.character(r)]
    names(rep) <- NULL
    rep
  }

  motus_distribution <- motus_distribution[order(motus_order), ]
  if (only_h1) {
    motus_distribution <- motus_distribution[motus_distribution$pval < alpha_risk, ]
  }

  motu_labels <- mapply(
    function(m, pval, p) {
      if (round(pval, 2) <= alpha_risk) {
        bquote(atop(textstyle(bolditalic(.(m))), 
                    textstyle(P[value] == .(p))))
      } else {
        bquote(atop(textstyle(italic(.(m))), 
                    textstyle(P[value] == .(p))))
      }
    },
    motus_distribution$motu,
    motus_distribution$pval,
    pretty10exp(motus_distribution$pval, drop.1 = TRUE, digits = 2)
  )

  cbind(
    data.frame(
      gradient = gradient,
      pcr = rownames(data),
      Site = site_factor(data@samples$Site_short)
    ),
    as.data.frame(relfreq)
  ) %>%
    pivot_longer(data = ., 
                 cols = names(.)[-c(1:3)], 
                 names_to = "motu") %>%
    group_by(gradient, Site, pcr) %>%
    mutate(g = cur_group_id()) %>%
    group_by(gradient) %>%
    mutate(g = urank(g) * jitter) %>%
    left_join(motus_distribution, by = "motu") %>%
    mutate(motu = factor(motu, 
                         levels = motus_distribution$motu[order(motus_distribution$gradient_weight)])) %>%
    arrange(gradient) %>%
    ggplot(aes(x = gradient - g, 
               xend = gradient - g, 
               y = 0, 
               yend = value * 100, 
               col = Site)) +
    facet_wrap(vars(motu),
      switch = "y",
      labeller = function(variable, value) motu_labels[value],
      ncol = 2, dir = "v"
    ) +
    geom_rect(aes(
      xmin = range_low, xmax = range_high,
      ymin = 0, ymax = 100 * as.numeric(pval < alpha_risk),
      alpha = 0.03
    ),
    fill = grey(0.8), col = 0, show.legend = FALSE
    ) +
    geom_segment(aes(x = gradient_weight, 
                     xend = gradient_weight, 
                     y = 30, yend = 100),
      col = rgb(0, 0, 0, 0.2),
      arrow = arrow(length = unit(3, "mm"), 
                    ends = "first", type = "closed")
    ) +
    geom_segment() +
    geom_point(aes(y = 0.1), cex = 0.3) +
    theme_paper() +
    theme(
      strip.text.y.left = element_text(
        angle = 0,
        size = 6,
        lineheight = 1,
        vjust = 0.5
      ),
      axis.text = element_text(size = 6),
      panel.spacing = unit(.4, "lines")
    ) +
    scale_y_log10(limits = c(0.1, 100)) +
    xlim(
      min(motus_distribution$range_low),
      max(motus_distribution$range_high)
    ) +
    ylab("Relative read frequencies (%)")
}

2 Loading the data

Processed data files cand be produced using the code present in the following notebooks :

2.1 Loading of the Euka03 cleaned dataset

euka03_motus <- read.csv2("cleaned_datasets/Euka03.cleaned.motus.csv",
  header = TRUE, row.names = 1)

euka03_samples <- read.csv2("cleaned_datasets/Euka03.cleaned.samples.csv",
  header = TRUE, row.names = 1) %>% 
  rename(Elevation = Altitude, Elev_groups = Alt_groups)

euka03_reads <- read.csv2("cleaned_datasets/Euka03.cleaned.reads.csv",
  header = TRUE, row.names = 1) %>% 
  as.matrix()

euka03 <- metabarcoding.data(
  reads = euka03_reads,
  samples = euka03_samples,
  motus = euka03_motus
)

rm(euka03_motus, euka03_samples, euka03_reads)

2.2 Loading of the Chlo01 cleaned dataset

chlo01_motus <- read.csv2("cleaned_datasets/Chlo01.cleaned.motus.csv",
  header = TRUE, row.names = 1
)

chlo01_samples <- read.csv2("cleaned_datasets/Chlo01.cleaned.samples.csv",
  header = TRUE, row.names = 1
) %>% rename(Elevation = Altitude, Elev_groups = Alt_groups)

chlo01_reads <- read.csv2("cleaned_datasets/Chlo01.cleaned.reads.csv",
  header = TRUE, row.names = 1
) %>% as.matrix()

chlo01 <- metabarcoding.data(
  reads = chlo01_reads,
  samples = chlo01_samples,
  motus = chlo01_motus
)

rm(chlo01_motus, chlo01_samples, chlo01_reads)

2.3 Loading of the Chlo02 cleaned dataset

chlo02_motus <- read.csv2("cleaned_datasets/Chlo02.cleaned.motus.csv",
  header = TRUE, row.names = 1
)

chlo02_samples <- read.csv2("cleaned_datasets/Chlo02.cleaned.samples.csv",
  header = TRUE, row.names = 1
) %>% rename(Elevation = Altitude, Elev_groups = Alt_groups)

chlo02_reads <- read.csv2("cleaned_datasets/Chlo02.cleaned.reads.csv",
  header = TRUE, row.names = 1
) %>% as.matrix()

chlo02 <- metabarcoding.data(
  reads = chlo02_reads,
  samples = chlo02_samples,
  motus = chlo02_motus
)

rm(chlo02_motus, chlo02_samples, chlo02_reads)

2.4 Loading of the NCBI taxonomy

The ncbi taxonomy have to be previously formated using the obitaxonomy command from OBITools.

ncbi <- read.taxonomy("embl-140/ncbi20190930")

3 Environmental variables selection and preparation

We’ll consider a set of 6 environmental variables assessed during the soil sampling.

3.1 Selection of the usable variables

3.1.1 Preparing the environmental data

Estimates the \(\alpha\text{-diversity} \; ^{1}D\) of samples

chlo01@samples$D_1 <- apply(chlo01@reads, MARGIN = 1, D_q)
environmental <- chlo01@samples %>%
  select(which(sapply(., is.numeric)), 
         Horizon, Milieu, Site = Site_short) %>%
  rename(Environment = Milieu) %>%
  mutate(
    Site = site_factor(Site),
    Environment = milieu_factor(Environment),
    Horizon = horizon_factor(Horizon)
  ) %>%
  # select_if(is.numeric) %>%
  select(-rep_pcr, -X, -Elev_groups, 
         -dist_barycenter, -chlorophyta_part, -D_1) %>%
  select(-starts_with("EEA_")) %>%
  mutate(cn_ratio = Carbon / Nitrogen)

3.1.2 Relationship between numerical and categorial environmental data

environmental %>%
  pivot_longer(
    cols = c("Horizon", "Environment", "Site"),
    names_to = "Category",
    values_to = "Modality"
  ) %>%
  pivot_longer(.,
    cols = which(sapply(., is.numeric)),
    names_to = "Variable",
    values_to = "Value"
  ) %>%
  mutate(Variable = variable_factor(Variable)) %>%
  ggplot(aes(x = Modality, y = Value)) +
  geom_boxplot() +
  facet_grid(Variable ~ Category, scales = "free") +
  theme(axis.text.x = element_text(
    angle = 20,
    hjust = 0.8
  ))

Keeps an renames only numerical environmental variables

environmental <- environmental %>%
  select(`Elevation`,
    `pH`,
    `Organic matter` = Organic_matter_2mm,
    `Carbon`,
    `Nitrogen`,
    `C/N Ratio` = cn_ratio
  )

The code below produces the supplementary Figure S1

environmental %>% ggpairs(
  progress = FALSE,
  aes(col = site_factor(chlo01@samples$Site_short)),
  alpha = 0.5,
  cex = 0.7,
  upper = list(continuous = wrap(ggally_cor, size = 2)),
  diag = list(continuous = wrap("densityDiag", alpha = 0.5), 
              combo = "box"),
  lower = list(continuous = wrap(ggally_points, size = 1))
) +
  theme(
    axis.text = element_text(size = 6),
    panel.spacing = unit(0.7, "lines")
  )

write_figure_2_cols("Figure_S1", height = 180)

3.1.3 Selection of a non-correlated subset of environmental variables

  • First round of selection
vif(environmental)
##         Carbon Organic matter       Nitrogen      C/N Ratio             pH 
##      49.349640      13.187735      12.869248       9.014369       1.693082 
##      Elevation 
##       1.407871

We remove Carbon

  • Second round of selection
environmental %>%
  select(-Carbon) %>%
  vif()
## Organic matter      C/N Ratio       Nitrogen             pH      Elevation 
##       6.589716       3.731378       3.456281       1.617080       1.402344

We remove Organic_matter_2mm

environmental %>%
  select(-Carbon, -`Organic matter`) %>%
  vif()
## C/N Ratio        pH  Nitrogen Elevation 
##  2.071394  1.621548  1.536338  1.406755

Every VIF are below five.

environmental %>% select(-Carbon, -`Organic matter`) -> selected_env
selected_env %>% ggpairs(
  progress = FALSE,
  aes(col = site_factor(chlo01@samples$Site_short))
)

4 Chlorophyta corresponds to a small fraction of environmental DNA

Chlorophyta DNA represents a small part of the of the extracted environmental DNA. It can be explained by a low capacity to extract algeae DNA from soil samples, and/or by a low biomass of algeae in soil. It as to be noticed that the used DNA extraction method target extracellular DNA.

Based on the Euka03 marker we can estimate the relative aboundance of fungi, plants and Chlorophyta in soil extracellular DNA.

Produces the Figure 2.

euka03@samples %>%
  select(Site = Site_short, ends_with("_part")) %>%
  rownames_to_column(var = "sample") %>%
  filter(chlorophyta_part > 0) %>%
  pivot_longer(
    cols = ends_with("_part"),
    names_to = "Taxonomic group", values_to = "fraction"
  ) %>%
  mutate(
    `Taxonomic group` = str_replace(`Taxonomic group`, 
                                    "_part", "") %>%
      str_to_title(),
    Site = site_factor(Site)
  ) %>%
  filter(`Taxonomic group` %in% c(
    "Fungi",
    "Streptophyta",
    "Chlorophyta"
  )) %>%
  mutate(`Taxonomic group` = factor(`Taxonomic group`,
    levels = c(
      "Chlorophyta",
      "Streptophyta",
      "Fungi"
    )
  )) %>%
  ggplot(mapping = aes(x = Site, y = fraction * 100, 
                       col = `Taxonomic group`)) +
  # geom_violin()  +
  geom_boxplot() +
  scale_y_log10() +
  xlab("Sampling site") +
  ylab("Euka03 Relative read frequency (%)") +
  theme_paper() +
  theme(
    legend.title = element_blank(),
    legend.text = element_text(size = 6),
    legend.position = "bottom",
    legend.box = "horizontal", legend.margin = margin()
  ) +
  guides(col = guide_legend(
    title.position = NULL,
    title.hjust = 0.5,
    nrow = 1
  ))

write_figure_1_col("Figure_2")

Moreover only 18.7% of the PCR with the Euka03 marker exhibit some Chlorophyta reads.

With the Chlo01 marker, we observed that many sequences was not annotated as Chlorophyta, despite that this marker is supposed to be highly specific to that clade. We can put in relation fraction of Chlorophyta by both the markers Euka03 and Chlo01.

euka03@samples %>%
  select(EXTRACTION.CODE, chlorophyta_part) %>%
  group_by(EXTRACTION.CODE) %>%
  summarise(
    euka03_algeae_part = ifelse(mean(chlorophyta_part),
      mean(chlorophyta_part[chlorophyta_part > 0]),
      0
    ),
    euka03_algeae_positive = sum(chlorophyta_part > 0)
  ) %>%
  inner_join(chlo01@samples %>%
    select(EXTRACTION.CODE, chlorophyta_part) %>%
    group_by(EXTRACTION.CODE) %>%
    summarise(
      chlo01_algeae_part = mean(chlorophyta_part[chlorophyta_part > 0])),
      by = "EXTRACTION.CODE"
  ) %>%
  rename(sample = EXTRACTION.CODE) -> algeae_parts

Because of the low amount of Chlorophyta DNA results are noisy. To tackle that problem we use iteratively reweighted least squares linear regression. In log scale the determination coefficient of the relation is \(R^2 = 25.7\%\)

algeae_parts %>%
  filter(euka03_algeae_positive > 0) %>%
  na.omit() %>%
  as.data.frame() %>%
  robustRegBS(log10(chlo01_algeae_part) ~ log10(euka03_algeae_part),
    data = .,
    anova.table = TRUE
  ) -> chlo01_euka03_algeae_part_lm
## 
## Robust Regression with Bisquare Function
## Convergence achieved after: 8 iterations
## source        SS          df      MS          F 
## model         0.04    1   0.04    10.94 
## error         0.13    54 
## tot           0.17    55 
## rsquared      0.252 
## mse           0.002346813 
## 
## Coefficients:
##                              estimate  std error t value p value
## (Intercept)               -0.01119624 0.05663185   -0.20 0.84402
## log10(euka03_algeae_part)  0.06728985 0.02034279    3.31 0.00168
chlo01_euka03_algeae_part_lm %>% summary()
##              Length Class  Mode   
## coefficients  2     -none- numeric
## weights      56     -none- numeric
## mse           1     -none- numeric

The code below produces the Figure 3

algeae_parts %>%
  filter(euka03_algeae_positive > 0) %>%
  mutate(
    weight = chlo01_euka03_algeae_part_lm$weights,
    euka03_algeae_part = log10(euka03_algeae_part),
    chlo01_algeae_part = log10(chlo01_algeae_part),
    part = chlo01_algeae_part < -1
  ) %>%
  ggplot(aes(
    x = euka03_algeae_part,
    y = chlo01_algeae_part,
    col = weight
  )) +
  geom_point(cex = 0.7) +
  geom_abline(
    slope = chlo01_euka03_algeae_part_lm$coefficients[2],
    intercept = chlo01_euka03_algeae_part_lm$coefficients[1],
    lty = 2
  ) +
  facet_grid(part ~ ., scales = "free_y", space = "free") +
  scale_y_continuous(
    labels = scales::number_format(accuracy = 0.1),
    breaks = seq(-1.5, 0, by = 0.1)
  ) +
  ylab(expression(paste(log[10], "(Chlo01 fraction)"))) +
  xlab(expression(paste(log[10], "(Euka03 fraction)"))) +
  theme_paper() +
  theme(
    strip.background = element_blank(),
    strip.text.y = element_blank()
  )

write_figure_1_col("Figure_3", height = 70)

No relationship can be established between Chlorophyta eDNA aboundances in the samples and any environmental variable measured.

We have to keep in mind that we are dealing with very low amount od DNA and therefore the sampling strategy is perhaps not the best. Consequently we are under sampling the Algeae diversity.

According to the same idea, the more abundant a taxon is in the overall experiment, the more ubiquitous it is.

The code below produces Figure 4

bind_rows(
  tibble(
    banality = ((chlo02@reads %>%
      decostand("total") %>%
      aggregate(by = list(chlo02@samples$Site_short), 
                mean))[, -1] > 0) %>%
      colSums(),
    frequency = (chlo02@reads %>%
      decostand("total") %>%
      aggregate(by = list(chlo02@samples$Site_short), 
                mean))[, -1] %>%
      apply(MARGIN = 2, FUN = max),
    Marker = "Chlo02"
  ),
  tibble(
    banality = ((chlo01@reads %>%
      decostand("total") %>%
      aggregate(by = list(chlo01@samples$Site_short), 
                mean))[, -1] > 0) %>%
      colSums(),
    frequency = (chlo01@reads %>%
      decostand("total") %>%
      aggregate(by = list(chlo01@samples$Site_short), 
                mean))[, -1] %>%
      apply(MARGIN = 2, FUN = max),
    Marker = "Chlo01"
  )
) %>%
  ggplot(aes(x = factor(banality), y = frequency * 100, 
             col = Marker)) +
  geom_boxplot() +
  scale_y_log10() +
  xlab("Number of gradient where a MOTUs occurs") +
  ylab("Relative MOTU frequencies (%)") +
  theme_paper()

write_figure_1_col("Figure_4")

5 Effect of the environmental variables on the Chlorophyta \(\alpha -diversity\)

The following code splits the pH, Nitrogen, C/N ratio, and elevation gradiants into seven discret levels.

chlo01@samples %>%
  filter(Site_short != "CHA") %>%
  mutate(
    Site = site_factor(Site_short),
    Horizon = horizon_factor(Horizon)
  ) %>%
  select(D_1, pH, Nitrogen, 
         cn_ratio = Carbon / Nitrogen, 
         Elevation, Site, Horizon) %>%
  mutate(
    pH_bin = cut(pH, breaks = 7),
    Nitrogen_bin = cut(Nitrogen, breaks = 7),
    cn_ratio_bin = cut(cn_ratio, breaks = 7),
    Elevation_bin = cut(Elevation, breaks = 7)
  ) %>%
  mutate(tmp = str_replace_all(pH_bin, 
                               pattern = "[()\\[\\]]", 
                               replacement = "")) %>%
  separate(col = tmp, 
           into = c("pH_low", "pH_high"), 
           sep = ",") %>%
  mutate(pH_low = as.numeric(pH_low), 
         pH_high = as.numeric(pH_high)) %>%
  mutate(tmp = str_replace_all(Nitrogen_bin, 
                               pattern = "[()\\[\\]]", 
                               replacement = "")) %>%
  separate(col = tmp, 
           into = c("Nitrogen_low", "Nitrogen_high"), 
           sep = ",") %>%
  mutate(Nitrogen_low = as.numeric(Nitrogen_low), 
         Nitrogen_high = as.numeric(Nitrogen_high)) %>%
  mutate(tmp = str_replace_all(cn_ratio_bin, 
                               pattern = "[()\\[\\]]", 
                               replacement = "")) %>%
  separate(col = tmp, 
           into = c("cn_ratio_low", "cn_ratio_high"), 
           sep = ",") %>%
  mutate(cn_ratio_low = as.numeric(cn_ratio_low), 
         cn_ratio_high = as.numeric(cn_ratio_high)) %>%
  mutate(tmp = str_replace_all(Elevation_bin, 
                               pattern = "[()\\[\\]]", 
                               replacement = "")) %>%
  separate(col = tmp, 
           into = c("Elevation_low", "Elevation_high"), 
           sep = ",") %>%
  mutate(Elevation_low = as.numeric(Elevation_low), 
         Elevation_high = as.numeric(Elevation_high)) -> data_diversity

The effect of these discretized gradients on Chlorophyta alpha diversity is tested using the Kruskall Wallis test.

p.adjust(c(
  kruskal.test(D_1 ~ pH_bin, data = data_diversity)$p.value,
  kruskal.test(D_1 ~ Elevation_bin, data = data_diversity)$p.value,
  kruskal.test(D_1 ~ Nitrogen_bin, data = data_diversity)$p.value,
  kruskal.test(D_1 ~ cn_ratio_bin, data = data_diversity)$p.value),
  method = "fdr") %>%
  pretty10exp(drop.1 = TRUE, digits = 2) %>%
  as.list() -> pval

And part of the Chlorophyta alpha diversity variance explained by these gradients is estimated using ANOVA.

c(
  lm(D_1 ~ pH_bin, data = data_diversity) %>% 
    anova() %>% pull(`Sum Sq`) %>% 
    (function(x) x[1] / sum(x))(),
  lm(D_1 ~ Elevation_bin, data = data_diversity) %>% 
    anova() %>% pull(`Sum Sq`) %>% 
    (function(x) x[1] / sum(x))(),
  lm(D_1 ~ Nitrogen_bin, data = data_diversity) %>% 
    anova() %>% pull(`Sum Sq`) %>% 
    (function(x) x[1] / sum(x))(),
  lm(D_1 ~ cn_ratio_bin, data = data_diversity) %>% 
    anova() %>% pull(`Sum Sq`) %>% 
    (function(x) x[1] / sum(x))()
) %>%
  round(2) %>%
  as.list() -> R2

names(pval) <- c("pval_pH", "pval_Elevation", 
                 "pval_Nitrogen", "pval_cn_ratio")
names(R2) <- c("R2_pH", "R2_Elevation", 
               "R2_Nitrogen", "R2_cn_ratio")
pvalR2 <- c(pval, R2)

The following code produces the Figure 5

ggplot(data = data_diversity) +
  geom_point(aes(y = D_1, x = pH, 
                 col = Site, shape = Horizon), cex = 0.8) +
  geom_segment(data = group_by(data_diversity, pH_bin) %>%
    summarise(
      m = mean(D_1),
      sd = sd(D_1) / sqrt(length(D_1)),
      low = mean(pH_low),
      high = mean(pH_high),
      ddl = length(D_1) - 1,
      se_low = m + qt(0.025, ddl) * sd,
      se_high = m + qt(0.975, ddl) * sd,
      delta = (high - low) * 0.05,
      center = (low + high) / 2
    ) -> stat, aes(y = m, yend = m, 
                   x = low + delta, 
                   xend = high - delta), size = 1.5) +
  geom_segment(data = stat, 
               aes(x = center, xend = center, 
                   y = se_low, yend = se_high)) +
  geom_segment(data = stat, 
               aes(x = center - 2 * delta, 
                   xend = center + 2 * delta, 
                   y = se_low, yend = se_low)) +
  geom_segment(data = stat, 
               aes(x = center - 2 * delta, 
                   xend = center + 2 * delta, 
                   y = se_high, yend = se_high)) +
  ylab(bquote("Hill's number" ~ q == 1 ~ "(" ~ phantom()^
    {
      1
    } * D ~ ")")) +
  xlab(bquote("pH" ~ "(" ~ p[value] == .(pval_pH) ~ 
                " ; " ~ R^2 == .(R2_pH) ~ ")",
              where = pvalR2)) +
  theme_paper() +
  theme(axis.title = element_text(size = 8)) -> plot_pH

ggplot(data = data_diversity) +
  geom_point(aes(y = D_1, x = Elevation, col = Site, 
                 shape = Horizon), cex = 0.8) +
  geom_segment(data = group_by(data_diversity, Elevation_bin) %>%
    summarise(
      m = mean(D_1),
      sd = sd(D_1) / sqrt(length(D_1)),
      low = mean(Elevation_low),
      high = mean(Elevation_high),
      ddl = length(D_1) - 1,
      se_low = m + qt(0.025, ddl) * sd,
      se_high = m + qt(0.975, ddl) * sd,
      delta = (high - low) * 0.05,
      center = (low + high) / 2
    ) -> stat, aes(y = m, yend = m, x = low + delta, xend = high - delta), size = 1.5) +
  geom_segment(data = stat, aes(x = center, 
                                xend = center, 
                                y = se_low, yend = se_high)) +
  geom_segment(data = stat, aes(x = center - 2 * delta, 
                                xend = center + 2 * delta, 
                                y = se_low, yend = se_low)) +
  geom_segment(data = stat, aes(x = center - 2 * delta, 
                                xend = center + 2 * delta, 
                                y = se_high, yend = se_high)) +
  ylab(bquote("Hill's number" ~ q == 1 ~ "(" ~ phantom()^
    {
      1
    } * D ~ ")")) +
  xlab(bquote("Elevation" ~ "(" ~ p[value] == .(pval_Elevation) ~ 
                " ; " ~ R^2 == .(R2_Elevation) ~ ")", 
              where = pvalR2)) +
  theme_paper() +
  theme(
    axis.title.x = element_text(size = 8),
    axis.title.y = element_blank()
  ) -> plot_Elevation

ggplot(data = data_diversity) +
  geom_point(aes(y = D_1, x = Nitrogen, 
                 col = Site, shape = Horizon), cex = 0.8) +
  geom_segment(data = group_by(data_diversity, Nitrogen_bin) %>%
    summarise(
      m = mean(D_1),
      sd = sd(D_1) / sqrt(length(D_1)),
      low = mean(Nitrogen_low),
      high = mean(Nitrogen_high),
      ddl = length(D_1) - 1,
      se_low = m + qt(0.025, ddl) * sd,
      se_high = m + qt(0.975, ddl) * sd,
      delta = (high - low) * 0.05,
      center = (low + high) / 2
    ) -> stat, aes(y = m, yend = m, 
                   x = low + delta, 
                   xend = high - delta), size = 1.5) +
  geom_segment(data = stat, 
               aes(x = center, xend = center, 
                   y = se_low, yend = se_high)) +
  geom_segment(data = stat, 
               aes(x = center - 2 * delta, 
                   xend = center + 2 * delta, 
                   y = se_low, yend = se_low)) +
  geom_segment(data = stat, 
               aes(x = center - 2 * delta, 
                   xend = center + 2 * delta, 
                   y = se_high, yend = se_high)) +
  ylab(bquote("Hill's number" ~ q == 1 ~ "(" ~ phantom()^
    {
      1
    } * D ~ ")")) +
  xlab(bquote("Nitrogen" ~ "(" ~ p[value] == .(pval_Nitrogen) ~ " ; " 
              ~ R^2 == .(R2_Nitrogen) ~ ")", where = pvalR2)) +
  theme_paper() +
  theme(axis.title = element_text(size = 8)) -> plot_Nitrogen

p <- pval["cn_ratio"]
ggplot(data = data_diversity) +
  geom_point(aes(y = D_1, x = cn_ratio, 
                 col = Site, shape = Horizon), cex = 0.8) +
  geom_segment(data = group_by(data_diversity, cn_ratio_bin) %>%
    summarise(
      m = mean(D_1),
      sd = sd(D_1) / sqrt(length(D_1)),
      low = mean(cn_ratio_low),
      high = mean(cn_ratio_high),
      ddl = length(D_1) - 1,
      se_low = m + qt(0.025, ddl) * sd,
      se_high = m + qt(0.975, ddl) * sd,
      delta = (high - low) * 0.05,
      center = (low + high) / 2
    ) -> stat, aes(y = m, yend = m, 
                   x = low + delta, 
                   xend = high - delta), size = 1.5) +
  geom_segment(data = stat, 
               aes(x = center, xend = center, 
                   y = se_low, yend = se_high)) +
  geom_segment(data = stat, 
               aes(x = center - 2 * delta, 
                   xend = center + 2 * delta, 
                   y = se_low, yend = se_low)) +
  geom_segment(data = stat, aes(x = center - 2 * delta, 
                                xend = center + 2 * delta, 
                                y = se_high, yend = se_high)) +
  ylab(bquote("Hill's number" ~ q == 1 ~ "(" ~ phantom()^
    {
      1
    } * D ~ ")")) +
  xlab(bquote("C/N ratio" ~ "(" ~ p[value] == .(pval_cn_ratio) ~ " ; " ~ 
                R^2 == .(R2_cn_ratio) ~ ")", where = pvalR2)) +
  theme_paper() +
  theme(
    axis.title.x = element_text(size = 8),
    axis.title.y = element_blank()
  ) -> plot_cn_ratio

ggarrange(NULL, plot_pH, NULL, plot_Elevation, NULL,
  NULL, plot_Nitrogen, NULL, plot_cn_ratio, NULL,
  widths = c(0.15, 1, 0.15, 1, 0.08),
  ncol = 5, nrow = 2,
  common.legend = TRUE, legend = "bottom",
  labels = c("(A)", "", "(B)", "", "", "(C)", "", "(D)", "", "")
)

write_figure_2_cols("figure_5", height = NA)
## Saving 180 x 102 mm image
## Saving 180 x 102 mm image

6 Description of the terrestrial Algae community

6.1 Distribution of the relative abundancies of the algae classes

Detected Chlorophyta belongs four taxonomical classes:

  • Pedinophyceae
  • Ulvophyceae
  • Chlorophyceae
  • Trebouxiophyceae

The relative abundances and diversities of these classes are estimated.

clades <- taxonatrank(ncbi, chlo01@motus$taxid, "class", name = TRUE)

t(apply(chlo01@reads, MARGIN = 1, 
        function(x) H_q(x, q = 1, clades = clades) / H_q(x, q = 1) * 100)) %>%
  as.data.frame() %>%
  rownames_to_column(var = "sample") %>%
  left_join(chlo01@samples, by = "sample") %>%
  pivot_longer(cols = ends_with("ceae"), 
               names_to = "Class", 
               values_to = "Relative entropy") %>%
  left_join(t(apply(chlo01@reads,
    MARGIN = 1,
    function(x) tapply(x, clades, sum) / sum(x)
  ) * 100) %>%
    as.data.frame() %>%
    rownames_to_column(var = "sample") %>%
    pivot_longer(cols = ends_with("ceae"), 
                 names_to = "Class", 
                 values_to = "Read frequency"),
  by = c("sample", "Class")
  ) %>%
  pivot_longer(cols = c("Relative entropy", 
                        "Read frequency"), 
               names_to = "Measure", 
               values_to = "Value") %>%
  na.omit() -> relative_diversity

The following code produces Figure 6

relative_diversity %>%
  filter(Measure == "Read frequency") %>%
  mutate(
    Class = factor(Class, levels = c(
      "Pedinophyceae", "Ulvophyceae",
      "Chlorophyceae", "Trebouxiophyceae"
    )),
    Site = site_factor(Site_short)
  ) %>%
  ggplot(aes(x = Site, y = Value, col = Class)) +
  geom_boxplot(outlier.size = 1) +
  scale_y_sqrt() +
  xlab("Site") +
  ylab("Chlo01 Relative read frequency (%)") +
  theme_paper() +
  theme(
    legend.title = element_blank(),
    legend.text = element_text(size = 6),
    legend.position = "bottom",
    legend.box = "vertical", legend.margin = margin()
  ) +
  guides(col = guide_legend(
    title.position = NULL,
    title.hjust = 0.5,
    nrow = 2
  ))

write_figure_1_col("Figure_6")

The Chlorophyta are mainly composed of Trebouxiophyceae and Chlorophyceae with a clear superiority for Trebouxiophyceae.

6.2 Impact of the environmental variables on the relative abundances of Trebouxiophyceae and Chlorophyceae

THe code below produces Figure 7

sub_figure_labeller <- function(labels) {
  subfig <- LETTERS[1:length(labels)]
  paste0("(", subfig, ")    ", labels)
}

relative_diversity %>%
  select(`Measure`, Value, Class, Site = Site_short, pH, Elevation) %>%
  mutate(Site = site_factor(Site)) %>%
  filter(Class %in% c("Chlorophyceae", 
                      "Trebouxiophyceae") & 
           Measure == "Read frequency") %>%
  filter(`Value` > 0) %>%
  pivot_longer(cols = c("pH", "Elevation"), 
               names_to = "env", 
               values_to = "Gradient") %>%
  mutate(env = factor(env, levels = c("pH", "Elevation"))) %>%
  ggplot() +
  geom_point(aes(
    y = Value, x = Gradient,
    shape = Class, col = Site
  )) +
  geom_smooth(
    mapping = aes(
      y = Value, x = Gradient,
      shape = Class
    ),
    method = "lm", formula = y ~ x, se = FALSE
  ) +
  facet_grid(. ~ env,
    scales = "free_x",
    labeller = as_labeller(sub_figure_labeller)
  ) +
  xlab("Environmental gradient") +
  ylab("Chlo01 Relative read frequency (%)") +
  theme_paper() +
  theme(strip.text = element_text(size = 12))
## Warning: Ignoring unknown aesthetics: shape

write_figure_2_cols("Figure_7")
relative_diversity %>%
  mutate(cn_ratio = Carbon / Nitrogen) %>%
  select(sample, Measure, Value, 
         Class, 
         Site = Site_short, pH, Elevation) %>%
  filter(Class == "Trebouxiophyceae") %>%
  pivot_wider(names_from = "Measure", 
              values_from = "Value") %>%
  filter(`Read frequency` > 0) %>%
  select(-sample) %>%
  na.omit() -> data

lm(`Relative entropy` ~ pH + Elevation + Site, data = data) -> ll

summary(ll)
## 
## Call:
## lm(formula = `Relative entropy` ~ pH + Elevation + Site, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -46.408  -5.991   2.179   9.383  36.989 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 136.147149   9.894678  13.760  < 2e-16 ***
## pH           -8.791001   1.878142  -4.681 4.32e-06 ***
## Elevation    -0.004454   0.002771  -1.607 0.109091    
## SiteCHA       1.829409   2.620021   0.698 0.485563    
## SiteLOR       0.282918   3.010153   0.094 0.925181    
## SiteRIS     -12.015472   3.362138  -3.574 0.000409 ***
## SiteVCH       3.276751   3.081396   1.063 0.288451    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.53 on 302 degrees of freedom
## Multiple R-squared:   0.32,  Adjusted R-squared:  0.3064 
## F-statistic: 23.68 on 6 and 302 DF,  p-value: < 2.2e-16
summary(lm(`Relative entropy` ~ pH + Site, data = data))
## 
## Call:
## lm(formula = `Relative entropy` ~ pH + Site, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -48.035  -6.274   2.366  10.113  39.637 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 130.0225     9.1551  14.202  < 2e-16 ***
## pH           -9.3094     1.8551  -5.018 8.90e-07 ***
## SiteCHA       2.6886     2.5716   1.045    0.297    
## SiteLOR       0.3585     3.0176   0.119    0.906    
## SiteRIS     -13.8959     3.1602  -4.397 1.52e-05 ***
## SiteVCH       0.8430     2.6905   0.313    0.754    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.57 on 303 degrees of freedom
## Multiple R-squared:  0.3141, Adjusted R-squared:  0.3028 
## F-statistic: 27.76 on 5 and 303 DF,  p-value: < 2.2e-16
summary(lm(`Relative entropy` ~ Elevation + Site, data = data))
## 
## Call:
## lm(formula = `Relative entropy` ~ Elevation + Site, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -53.402  -5.671   2.836  10.001  38.877 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  98.019545   5.807677  16.878  < 2e-16 ***
## Elevation    -0.006682   0.002823  -2.367   0.0186 *  
## SiteCHA      -1.998967   2.573531  -0.777   0.4379    
## SiteLOR       5.035255   2.929898   1.719   0.0867 .  
## SiteRIS     -18.387621   3.178489  -5.785 1.81e-08 ***
## SiteVCH       2.652988   3.182960   0.833   0.4052    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.03 on 303 degrees of freedom
## Multiple R-squared:  0.2706, Adjusted R-squared:  0.2586 
## F-statistic: 22.48 on 5 and 303 DF,  p-value: < 2.2e-16
anova(ll)
anova(ll)$`Sum Sq` / sum(anova(ll)$`Sum Sq`)
## [1] 0.22015741 0.02563475 0.07416259 0.68004525

6.3 Impact of the environmental variables on the communities

6.3.1 Construction of the RDA modele

A redoundancy analysis (RDA) is done to force alignement of Community in the space of selected variables

Read count are transformed according to Hellinger method (square roots of the relative frequencies)

chlo01_communities <- decostand(chlo01@reads, method = "hellinger")

Environmental variables are centred and scaled.

scaled_env <- selected_env %>%
  scale() %>%
  as.data.frame()
chlo01_rda <- rda(chlo01_communities ~ . + Condition(chlo01@samples$Site_short),
  data = scaled_env,
  scale = FALSE
)

A step forward/backward procedure is then run to select the best model.

ordistep(rda(chlo01_communities ~ 1 + 
               Condition(chlo01@samples$Site_short),
  data = scaled_env,
  scale = FALSE
),
scope = formula(chlo01_rda),
permutations = 999, trace = TRUE
) -> chlo01_rda_selected
## 
## Start: chlo01_communities ~ 1 + Condition(chlo01@samples$Site_short) 
## 
##               Df     AIC      F Pr(>F)    
## + `C/N Ratio`  1 -139.83 9.8047  0.001 ***
## + Elevation    1 -138.66 8.6221  0.001 ***
## + pH           1 -136.94 6.8891  0.001 ***
## + Nitrogen     1 -135.24 5.1871  0.001 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: chlo01_communities ~ Condition(chlo01@samples$Site_short) + `C/N Ratio` 
## 
##               Df     AIC      F Pr(>F)    
## - `C/N Ratio`  1 -131.99 9.8047  0.001 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##             Df     AIC      F Pr(>F)    
## + Nitrogen   1 -143.71 5.8054  0.001 ***
## + Elevation  1 -143.08 5.1726  0.001 ***
## + pH         1 -141.50 3.6085  0.001 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: chlo01_communities ~ Condition(chlo01@samples$Site_short) + `C/N Ratio` +      Nitrogen 
## 
##               Df     AIC       F Pr(>F)    
## - Nitrogen     1 -139.83  5.8054  0.001 ***
## - `C/N Ratio`  1 -135.24 10.4175  0.001 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##             Df     AIC      F Pr(>F)    
## + Elevation  1 -148.16 6.3523  0.001 ***
## + pH         1 -145.65 3.8656  0.001 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: chlo01_communities ~ Condition(chlo01@samples$Site_short) + `C/N Ratio` +      Nitrogen + Elevation 
## 
##               Df     AIC      F Pr(>F)    
## - Elevation    1 -143.71 6.3523  0.001 ***
## - `C/N Ratio`  1 -143.18 6.8803  0.001 ***
## - Nitrogen     1 -143.08 6.9855  0.001 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##      Df     AIC      F Pr(>F)    
## + pH  1 -150.54 4.2846  0.001 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: chlo01_communities ~ Condition(chlo01@samples$Site_short) + `C/N Ratio` +      Nitrogen + Elevation + pH 
## 
##               Df     AIC      F Pr(>F)    
## - pH           1 -148.16 4.2846  0.001 ***
## - `C/N Ratio`  1 -147.77 4.6729  0.001 ***
## - Elevation    1 -145.65 6.7667  0.001 ***
## - Nitrogen     1 -144.91 7.5024  0.001 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Every considered variables are significantly used in the model and therefore the complete model is selected.

The biplot of the selected model constitutes the Figure 8.

gscale <- sqrt(attributes(summary(chlo01_rda_selected))$const) / 2
summary(chlo01_rda_selected)$site %>%
  as.data.frame() %>%
  rownames_to_column(var = "sample") %>%
  left_join(chlo01@samples, by = "sample") %>%
  mutate(
    Environment = milieu_factor(Environment),
    Site = site_factor(Site_short)
  ) %>%
  ggplot() +
  geom_point(aes(x = RDA1, y = RDA2, 
                 col = Site, pch = Environment)) +
  geom_segment(
    data = summary(chlo01_rda_selected)$biplot %>%
      as.data.frame() %>%
      add_rownames(var = "variable"),
    mapping = aes(x = 0, xend = RDA1 * 
                    gscale, y = 0, yend = RDA2 * gscale),
    col = "black",
    arrow = arrow(length = unit(0.10, "cm"), type = "closed")
  ) +
  geom_text_repel(
    data = summary(chlo01_rda_selected)$biplot %>%
      as.data.frame() %>%
      add_rownames(var = "variable") %>%
      mutate(variable = str_replace_all(variable, "`", "")),
    mapping = aes(x = RDA1 * gscale, 
                  y = RDA2 * gscale, 
                  label = variable),
    hjust = 1, vjust = 0, 
    nudge_x = 0.1, nudge_y = 0.05, 
    cex = 3,
    segment.size = 0
  ) +
  theme_paper() -> plot_RDA_12

# gscale <- sqrt(attributes(summary(chlo01_rda_selected))$const)

summary(chlo01_rda_selected)$site %>%
  as.data.frame() %>%
  rownames_to_column(var = "sample") %>%
  left_join(chlo01@samples, by = "sample") %>%
  mutate(
    Environment = milieu_factor(Environment),
    Site = site_factor(Site_short)
  ) %>%
  ggplot() +
  geom_point(aes(x = RDA3, y = RDA4, 
                 col = Site, pch = Environment)) +
  geom_segment(
    data = summary(chlo01_rda_selected)$biplot %>%
      as.data.frame() %>%
      add_rownames(var = "variable"),
    mapping = aes(x = 0, xend = RDA3 * gscale, 
                  y = 0, yend = RDA4 * gscale),
    col = "black",
    arrow = arrow(length = unit(0.10, "cm"), type = "closed")
  ) +
  geom_text_repel(
    data = summary(chlo01_rda_selected)$biplot %>%
      as.data.frame() %>%
      add_rownames(var = "variable"),
    mapping = aes(x = RDA3 * gscale, 
                  y = RDA4 * gscale, label = variable),
    hjust = 0, vjust = 0.5, cex = 3,
    segment.size = 0,
    nudge_x = 0.05, nudge_y = 0.05
  ) +
  theme_paper() -> plot_RDA_34

ggarrange(NULL, plot_RDA_12,
  NULL, plot_RDA_34,
  NULL,
  ncol = 5,
  common.legend = TRUE, legend = "bottom",
  widths = c(0.15, 1, 0.15, 1, 0.08),
  labels = c("(A)", "", "(B)", "", "")
)

write_figure_2_cols("Figures_8", height = 90)

6.3.2 Estimation of the variance partition

Variance partitionning of the community changes by the considered environmental variables is estimated.

The global effect is :

chlo01_rda <- rda(chlo01_communities ~ . + 
                    Condition(chlo01@samples$Site_short),
  data = scaled_env,
  scale = FALSE
)

chlo01_rda
## Call: rda(formula = chlo01_communities ~ Elevation + pH + Nitrogen +
## `C/N Ratio` + Condition(chlo01@samples$Site_short), data = scaled_env,
## scale = FALSE)
## 
##               Inertia Proportion Rank
## Total         0.69723    1.00000     
## Conditional   0.05269    0.07557    4
## Constrained   0.05116    0.07338    4
## Unconstrained 0.59337    0.85105  301
## Inertia is variance 
## 
## Eigenvalues for constrained axes:
##     RDA1     RDA2     RDA3     RDA4 
## 0.029040 0.012031 0.005603 0.004486 
## 
## Eigenvalues for unconstrained axes:
##     PC1     PC2     PC3     PC4     PC5     PC6     PC7     PC8 
## 0.07005 0.03102 0.02873 0.02485 0.02304 0.02258 0.02006 0.01773 
## (Showing 8 of 301 unconstrained eigenvalues)
RsquareAdj(chlo01_rda)
## $r.squared
## [1] 0.07337594
## 
## $adj.r.squared
## [1] 0.06325573

Pure effects of each variable are estimated declaring other variables as condition to remove their effects

  • For pH
cov <- cbind(Site = chlo01@samples$Site_short, 
             as.data.frame(scaled_env))
chlo01_rda_pH <- rda(chlo01_communities ~ pH + 
                       Condition(Site + Elevation + Nitrogen + `C/N Ratio`),
  data = cov,
  scale = FALSE
)
RsquareAdj(chlo01_rda_pH)
## $r.squared
## [1] 0.01168719
## 
## $adj.r.squared
## [1] 0.009159832
  • For Elevation
cov <- cbind(Site = chlo01@samples$Site_short, 
             as.data.frame(scaled_env))
chlo01_rda_Elev <- rda(chlo01_communities ~ Elevation + 
                         Condition(Site + pH + Nitrogen + `C/N Ratio`),
  data = cov,
  scale = FALSE
)
RsquareAdj(chlo01_rda_Elev)
## $r.squared
## [1] 0.01845776
## 
## $adj.r.squared
## [1] 0.01608183
  • For Nitrogen
cov <- cbind(Site = chlo01@samples$Site_short, 
             as.data.frame(scaled_env))
chlo01_rda_N <- rda(chlo01_communities ~ Nitrogen + 
                      Condition(Site + pH + Elevation + `C/N Ratio`),
  data = cov,
  scale = FALSE
)
RsquareAdj(chlo01_rda_N)
## $r.squared
## [1] 0.02046451
## 
## $adj.r.squared
## [1] 0.01813346
  • For C/N Ratio
cov <- cbind(Site = chlo01@samples$Site_short, 
             as.data.frame(scaled_env))
chlo01_rda_CN <- rda(chlo01_communities ~ `C/N Ratio` + 
                       Condition(Site + pH + Elevation + Nitrogen),
  data = cov,
  scale = FALSE
)
RsquareAdj(chlo01_rda_CN)
## $r.squared
## [1] 0.01274634
## 
## $adj.r.squared
## [1] 0.01024267

Global analysis was also realized using the Vegan varpart function.

varpart(chlo01_communities,
  ~pH,
  ~Elevation,
  ~Nitrogen,
  ~`C/N Ratio`,
  data = scaled_env,
  scale = FALSE
) -> chlo01_rda_parts
par(mfrow = c(1, 2))

plot(chlo01_rda_parts,
  digit = 2, cex = 0.6, id.size = 0.5, 
  bg = c("grey", "blue", "yellow", "red"),
  alpha = 50,
  Xnames = c("pH", "Elevation", "Nitrogen", "C/N ratio")
)

showvarparts(4,
  cex = 0.6, id.size = 0.5, 
  bg = c("grey", "blue", "yellow", "red"),
  alpha = 50,
  Xnames = c("pH", "Elevation", "Nitrogen", "C/N ratio")
)

# dev.copy(pdf, "Figures/Chlo01_RDA_var_parts.pdf")
  • X1 : pH
  • X2 : Elevation
  • X3 : Nitrogen
  • X4 : C/N ratio
chlo01_rda_parts
## 
## Partition of variance in RDA 
## 
## Call: varpart(Y = chlo01_communities, X = ~pH, ~Elevation, ~Nitrogen,
## ~`C/N Ratio`, data = scaled_env, scale = FALSE)
## 
## Explanatory tables:
## X1:  ~pH
## X2:  ~Elevation
## X3:  ~Nitrogen
## X4:  ~`C/N Ratio` 
## 
## No. of explanatory tables: 4 
## Total variation (SS): 223.11 
##             Variance: 0.69723 
## No. of observations: 321 
## 
## Partition table:
##                             Df R.square Adj.R.square Testable
## [aeghklno] = X1              1  0.02943      0.02638     TRUE
## [befiklmo] = X2              1  0.03856      0.03555     TRUE
## [cfgjlmno] = X3              1  0.02292      0.01986     TRUE
## [dhijkmno] = X4              1  0.03564      0.03261     TRUE
## [abefghiklmno] = X1+X2       2  0.05844      0.05252     TRUE
## [acefghjklmno] = X1+X3       2  0.04900      0.04301     TRUE
## [adeghijklmno] = X1+X4       2  0.05325      0.04729     TRUE
## [bcefgijklmno] = X2+X3       2  0.05854      0.05262     TRUE
## [bdefhijklmno] = X2+X4       2  0.05810      0.05218     TRUE
## [cdfghijklmno] = X3+X4       2  0.05512      0.04917     TRUE
## [abcefghijklmno] = X1+X2+X3  3  0.08038      0.07168     TRUE
## [abdefghijklmno] = X1+X2+X4  3  0.07601      0.06726     TRUE
## [acdefghijklmno] = X1+X3+X4  3  0.07353      0.06476     TRUE
## [bcdefghijklmno] = X2+X3+X4  3  0.07868      0.06996     TRUE
## [abcdefghijklmno] = All      4  0.09805      0.08663     TRUE
## Individual fractions                                         
## [a] = X1 | X2+X3+X4          1               0.01668     TRUE
## [b] = X2 | X1+X3+X4          1               0.02187     TRUE
## [c] = X3 | X1+X2+X4          1               0.01937     TRUE
## [d] = X4 | X1+X2+X3          1               0.01496     TRUE
## [e]                          0              -0.00109    FALSE
## [f]                          0              -0.00190    FALSE
## [g]                          0              -0.00160    FALSE
## [h]                          0               0.00238    FALSE
## [i]                          0               0.00679    FALSE
## [j]                          0              -0.00022    FALSE
## [k]                          0               0.00519    FALSE
## [l]                          0               0.00069    FALSE
## [m]                          0              -0.00062    FALSE
## [n]                          0              -0.00049    FALSE
## [o]                          0               0.00463    FALSE
## [p] = Residuals              0               0.91337    FALSE
## Controlling 2 tables X                                       
## [ae] = X1 | X3+X4            1               0.01559     TRUE
## [ag] = X1 | X2+X4            1               0.01508     TRUE
## [ah] = X1 | X2+X3            1               0.01906     TRUE
## [be] = X2 | X3+X4            1               0.02078     TRUE
## [bf] = X2 | X1+X4            1               0.01997     TRUE
## [bi] = X2 | X1+X3            1               0.02866     TRUE
## [cf] = X3 | X1+X4            1               0.01747     TRUE
## [cg] = X3 | X2+X4            1               0.01778     TRUE
## [cj] = X3 | X1+X2            1               0.01916     TRUE
## [dh] = X4 | X2+X3            1               0.01734     TRUE
## [di] = X4 | X1+X3            1               0.02175     TRUE
## [dj] = X4 | X1+X2            1               0.01474     TRUE
## Controlling 1 table X                                        
## [aghn] = X1 | X2             1               0.01697     TRUE
## [aehk] = X1 | X3             1               0.02316     TRUE
## [aegl] = X1 | X4             1               0.01468     TRUE
## [bfim] = X2 | X1             1               0.02614     TRUE
## [beik] = X2 | X3             1               0.03276     TRUE
## [befl] = X2 | X4             1               0.01957     TRUE
## [cfjm] = X3 | X1             1               0.01663     TRUE
## [cgjn] = X3 | X2             1               0.01707     TRUE
## [cfgl] = X3 | X4             1               0.01656     TRUE
## [dijm] = X4 | X1             1               0.02091     TRUE
## [dhjn] = X4 | X2             1               0.01663     TRUE
## [dhik] = X4 | X3             1               0.02932     TRUE
## ---
## Use function 'rda' to test significance of fractions of interest

7 Heterogeneous distribution of species and genus

The heterogeneous distribution of the MOTUs is studied only for MOTUs that where taxonomically identified at the species or genus levels.

Four gradients are investigated:

7.1 Selection of the MOTUs identified at the species and at the genus level

7.1.1 For species

A subset of the MOTUs annotated at the species level is extracted from the global data set and aggregated to merge every MOTUs annotated by the same species.

chlo01_species <- chlo01[, !is.na(chlo01@motus$species)]
chlo01_species <- aggregate(chlo01_species,
  MARGIN = "motus",
  by = list(chlo01_species@motus$species_name),
  FUN = sum
)

7.1.2 For genera

A subset of the MOTUs annotated at the genus level is extracted from the global data set and aggregated to merge every MOTUs annotated by the same genus.

chlo01_genus <- chlo01[, !is.na(chlo01@motus$genus)]
chlo01_genus <- aggregate(chlo01_genus,
  MARGIN = "motus",
  by = list(chlo01_genus@motus$genus_name),
  FUN = sum
)

7.2 Analysis of MOTUS distribution along elevation gradient

7.2.1 For species

Distribution ranges are estimated using the motus_gradient defined above.

species_elev <- niche_motus_gradient(chlo01_species, 
                                     chlo01_species@samples$Elevation)

The code below produces the supplentary Figure S3

plot_motus_gradient(
  chlo01_species,
  chlo01_species@samples$Elevation,
  species_elev
) + xlab("Elevation of the sample")

write_figure_2_cols("Figure_S3", height = 270)

7.2.2 For genera

genus_elev <- niche_motus_gradient(chlo01_genus, 
                                   chlo01_genus@samples$Elevation)

The code below produces the Figure 9

plot_motus_gradient(
  chlo01_genus,
  chlo01_genus@samples$Elevation,
  genus_elev
) +
  xlab("Elevation of the sample")

write_figure_2_cols("Figure_9", height = 270)

7.3 Analysis of species distribution along pH gradient

7.3.1 For species

species_ph <- niche_motus_gradient(chlo01_species, 
                                   chlo01_species@samples$pH)

The code below produces the supplentary Figure S4

plot_motus_gradient(chlo01_species,
  chlo01_species@samples$pH,
  species_ph,
  jitter = 0.01
) +
  xlab("Soil pH")

write_figure_2_cols("Figure_S4", height = 270)

7.3.2 For genera

genus_ph <- niche_motus_gradient(chlo01_genus, 
                                 chlo01_genus@samples$pH)

The code below produces the supplentary Figure S5

plot_motus_gradient(chlo01_genus,
  chlo01_genus@samples$pH,
  genus_ph,
  jitter = 0.01
) +
  xlab("Soil pH")

write_figure_2_cols("Figure_S5", height = 270)

7.4 Analysis of species distribution along Nitrogen gradient

7.4.1 For species

species_Nitrogen <- niche_motus_gradient(chlo01_species,
                                         chlo01_species@samples$Nitrogen)

The code below produces the supplentary Figure S6

plot_motus_gradient(chlo01_species,
  chlo01_species@samples$Nitrogen,
  species_Nitrogen,
  jitter = 0.01
) +
  xlab("Soil Nitrogen")

write_figure_2_cols("Figure_S6", height = 270)

7.4.2 For genera

genus_Nitrogen <- niche_motus_gradient(chlo01_genus, 
                                       chlo01_genus@samples$Nitrogen)

The code below produces the supplentary Figure S7

plot_motus_gradient(chlo01_genus,
  chlo01_genus@samples$Nitrogen,
  genus_Nitrogen,
  jitter = 0.01
) +
  xlab("Soil Nitrogen")

write_figure_2_cols("Figure_S7", height = 270)

7.5 Analysis of species distribution along C/N ratio gradient

7.5.1 For species

species_cn_ratio <- niche_motus_gradient(
  chlo01_species,
  chlo01_species@samples$Carbon / chlo01_species@samples$Nitrogen
)

The code below produces the supplentary Figure S8

plot_motus_gradient(chlo01_species,
  chlo01_species@samples$Carbon / chlo01_species@samples$Nitrogen,
  species_cn_ratio,
  jitter = 0.01
) +
  xlab("Soil C/N ratio")

write_figure_2_cols("Figure_S8", height = 270)

7.5.2 For genera

genus_cn_ratio <- niche_motus_gradient(
  chlo01_genus,
  chlo01_genus@samples$Carbon / chlo01_genus@samples$Nitrogen
)

The code below produces the supplentary Figure S9

plot_motus_gradient(chlo01_genus,
  chlo01_genus@samples$Carbon / chlo01_genus@samples$Nitrogen,
  genus_cn_ratio,
  jitter = 0.01
) +
  xlab("Soil C/N ratio")

write_figure_2_cols("Figure_S9", height = 270)

7.6 Global analysis of the niche following the for variable

A delimitation of the niche for the species and genus levels MOTUs, considering the four environmental variables, is computed.

env_var <- tibble(
  Elevation = chlo01_genus@samples$Elevation,
  pH = chlo01_genus@samples$pH,
  Nitrogen = chlo01_genus@samples$Nitrogen,
  `CN Ratio` = chlo01_genus@samples$Carbon / chlo01_genus@samples$Nitrogen
)


env_var_pca <- dudi.pca(env_var, scannf = FALSE)

niche_species <- niche(env_var_pca,
  as.data.frame(decostand(chlo01_species@reads, method = "total")),
  scannf = FALSE
)
niche_genus <- niche(env_var_pca,
  as.data.frame(decostand(chlo01_genus@reads, method = "total")),
  scannf = FALSE
)

niche_species_tests <- rtest_niche(niche_species, nrepet = 999)
niche_genus_tests <- rtest_niche(niche_genus, nrepet = 999)

Specialized MOTUs, are selected based on the rtest_niche function.

niche_barycenter_species <- sweep(sweep(niche_species$tab,
  MARGIN = 2,
  STATS = apply(env_var, 2, sd), "*"
),
MARGIN = 2,
STATS = apply(env_var, 2, mean), "+"
) %>%
  mutate(
    name = names(niche_species_tests)[1:nrow(niche_species$tab)],
    class = chlo01_species@motus$class,
    pval = niche_species_tests,
    `taxonomic rank` = "species"
  ) %>%
  filter(p.adjust(pval, method = "fdr") < 0.05)

niche_barycenter_genus <- sweep(sweep(niche_genus$tab,
  MARGIN = 2,
  STATS = apply(env_var, 2, sd), "*"
),
MARGIN = 2,
STATS = apply(env_var, 2, mean), "+"
) %>%
  mutate(
    name = names(niche_genus_tests)[1:nrow(niche_genus$tab)],
    class = chlo01_genus@motus$class,
    pval = niche_genus_tests,
    `taxonomic rank` = "genus"
  ) %>%
  filter(p.adjust(pval, method = "fdr") < 0.05)

niche_barycenter_species %>% 
  bind_rows(niche_barycenter_genus) -> niche_barycenter
niche_barycenter

A principal correspondance analysis on the niche center is realized to compare them.

niche_barycenter %>%
  select(Elevation, pH, Nitrogen, `CN Ratio`) %>%
  dudi.pca(scannf = FALSE) -> niches_pca

The code below produces the Figure 10

niches_pca %>%
  factoextra::fviz_pca_biplot(
    repel = TRUE, labelsize = 2,
    pointsize = 0.5, title = "",
    col.ind = niche_barycenter$class,
    addEllipses = FALSE,
  ) +
  theme(
    axis.title = element_text(size = 8),
    axis.text = element_text(size = 8),
    legend.position = "bottom",
    legend.text = element_text(size = 8),
    legend.title = element_blank()
  ) + guides(col = guide_legend(
    title.position = NULL,
    title.hjust = 0.5,
    nrow = 2
  ))
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

write_figure_1_col("Figure_10", height = NA)
## Saving 85 x 127 mm image
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Saving 85 x 127 mm image
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

We endly test the non-homogeneous representation of Chlorophyceae and Trebouxiophyceae along of the first axis.

wilcox.test(niches_pca$li[niche_barycenter$class %in% 
                            c("Chlorophyceae", "Trebouxiophyceae"), 1] ~
              (niche_barycenter$class[niche_barycenter$clas %in% 
                                        c("Chlorophyceae", 
                                          "Trebouxiophyceae")]))
## 
##  Wilcoxon rank sum exact test
## 
## data:  niches_pca$li[niche_barycenter$class %in% c("Chlorophyceae", "Trebouxiophyceae"), 1] by niche_barycenter$class[niche_barycenter$clas %in% c("Chlorophyceae", "Trebouxiophyceae")]
## W = 39, p-value = 1.243e-06
## alternative hypothesis: true location shift is not equal to 0